-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[Instcombine]: llvm.ucmp and llvm.scmp recognition
#168505
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-llvm-transforms Author: None (kper) Changes: Created a pattern to recognize `llvm.ucmp` and `llvm.scmp`. Closes #166579. Full diff: https://github.com/llvm/llvm-project/pull/168505.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 9572f9d702e1b..5c8008700e181 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1955,6 +1955,48 @@ static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI,
return nullptr;
}
+/// Transform
+///
+/// select(icmp(eq, X, Y), Z, select(icmp(ult, X, Y), -1, 1))
+/// into select(icmp(eq, X, Y), Z, llvm.ucmp(freeze(X), freeze(Y)))
+///
+/// or
+///
+/// select(icmp(eq, X, Y), Z, select(icmp(slt, X, Y), -1, 1))
+/// into select(icmp(eq, X, Y), Z, llvm.scmp(freeze(X), freeze(Y)))
+static Value *foldSelectToInstrincCmp(SelectInst &SI, const ICmpInst *ICI,
+ Value *TrueVal, Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+
+ if (Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ CmpPredicate IPred;
+ if (match(FalseVal, m_Select(m_ICmp(IPred, m_Specific(ICI->getOperand(0)),
+ m_Specific(ICI->getOperand(1))),
+ m_AllOnes(), m_One())) &&
+ (IPred == ICmpInst::ICMP_ULT || IPred == ICmpInst::ICMP_SLT)) {
+ Value *X = ICI->getOperand(0);
+ Value *Y = ICI->getOperand(1);
+ Builder.SetInsertPoint(&SI);
+ auto IID = IPred == ICmpInst::ICMP_ULT ? Intrinsic::ucmp : Intrinsic::scmp;
+
+ // Edge Case: if Z is the constant 0 then the select can be folded
+ // to just the intrinsic comparison.
+ if (match(TrueVal, m_Zero()))
+ return Builder.CreateIntrinsic(SI.getType(), IID, {X, Y});
+
+ Value *FrozenX = Builder.CreateFreeze(X, X->getName() + ".frz");
+ Value *FrozenY = Builder.CreateFreeze(Y, Y->getName() + ".frz");
+ Value *Cmp =
+ Builder.CreateIntrinsic(FrozenX->getType(), IID, {FrozenX, FrozenY});
+ return Builder.CreateSelect(SI.getCondition(), TrueVal, Cmp, "select.ucmp");
+ }
+
+ return nullptr;
+}
+
/// Fold `X Pred C1 ? X BOp C2 : C1 BOp C2` to `min/max(X, C1) BOp C2`.
/// This allows for better canonicalization.
Value *InstCombinerImpl::foldSelectWithConstOpToBinOp(ICmpInst *Cmp,
@@ -2186,6 +2228,9 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
if (Value *V = foldSelectWithConstOpToBinOp(ICI, TrueVal, FalseVal))
return replaceInstUsesWith(SI, V);
+ if (Value *V = foldSelectToInstrincCmp(SI, ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
return Changed ? &SI : nullptr;
}
diff --git a/llvm/test/Transforms/InstCombine/select-cmp.ll b/llvm/test/Transforms/InstCombine/select-cmp.ll
index b1bd7a0ecc8ac..bf1a6cb047c37 100644
--- a/llvm/test/Transforms/InstCombine/select-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-cmp.ll
@@ -808,5 +808,119 @@ define i1 @icmp_lt_slt(i1 %c, i32 %arg) {
ret i1 %select
}
+define i16 @icmp_fold_to_llvm_ucmp_when_eq(i16 %x, i16 %y) {
+; CHECK-LABEL: @icmp_fold_to_llvm_ucmp_when_eq(
+; CHECK-NEXT: [[Y_FRZ:%.*]] = freeze i16 [[Y:%.*]]
+; CHECK-NEXT: [[X_FRZ:%.*]] = freeze i16 [[X:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i16 [[X_FRZ]], [[Y_FRZ]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.ucmp.i16.i16(i16 [[X_FRZ]], i16 [[Y_FRZ]])
+; CHECK-NEXT: [[SELECT_UCMP:%.*]] = select i1 [[TMP1]], i16 42, i16 [[TMP2]]
+; CHECK-NEXT: ret i16 [[SELECT_UCMP]]
+;
+ %3 = icmp eq i16 %x, %y
+ %4 = icmp ult i16 %x, %y
+ %5 = select i1 %4, i16 -1, i16 1
+ %6 = select i1 %3, i16 42, i16 %5
+ ret i16 %6
+}
+
+define i16 @icmp_fold_to_llvm_ucmp_when_ult_and_Z_zero(i16 %x, i16 %y) {
+; CHECK-LABEL: @icmp_fold_to_llvm_ucmp_when_ult_and_Z_zero(
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.ucmp.i16.i16(i16 [[X:%.*]], i16 [[Y:%.*]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %3 = icmp eq i16 %x, %y
+ %4 = icmp ult i16 %x, %y
+ %5 = select i1 %4, i16 -1, i16 1
+ %6 = select i1 %3, i16 0, i16 %5
+ ret i16 %6
+}
+
+define i16 @icmp_fold_to_llvm_ucmp_when_slt_and_Z_zero(i16 %x, i16 %y) {
+; CHECK-LABEL: @icmp_fold_to_llvm_ucmp_when_slt_and_Z_zero(
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.scmp.i16.i16(i16 [[X:%.*]], i16 [[Y:%.*]])
+; CHECK-NEXT: ret i16 [[TMP1]]
+;
+ %3 = icmp eq i16 %x, %y
+ %4 = icmp slt i16 %x, %y
+ %5 = select i1 %4, i16 -1, i16 1
+ %6 = select i1 %3, i16 0, i16 %5
+ ret i16 %6
+}
+
+define i16 @icmp_fold_to_llvm_ucmp_when_cmp_slt(i16 %x, i16 %y) {
+; CHECK-LABEL: @icmp_fold_to_llvm_ucmp_when_cmp_slt(
+; CHECK-NEXT: [[Y_FRZ:%.*]] = freeze i16 [[Y:%.*]]
+; CHECK-NEXT: [[X_FRZ:%.*]] = freeze i16 [[X:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i16 [[X_FRZ]], [[Y_FRZ]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.scmp.i16.i16(i16 [[X_FRZ]], i16 [[Y_FRZ]])
+; CHECK-NEXT: [[SELECT_UCMP:%.*]] = select i1 [[TMP1]], i16 42, i16 [[TMP2]]
+; CHECK-NEXT: ret i16 [[SELECT_UCMP]]
+;
+ %3 = icmp eq i16 %x, %y
+ %4 = icmp slt i16 %x, %y ; here "ult" changed to "slt"
+ %5 = select i1 %4, i16 -1, i16 1
+ %6 = select i1 %3, i16 42, i16 %5
+ ret i16 %6
+}
+
+define i16 @icmp_fold_to_llvm_ucmp_when_value(i16 %x, i16 %y, i16 %Z) {
+; CHECK-LABEL: @icmp_fold_to_llvm_ucmp_when_value(
+; CHECK-NEXT: [[Y_FRZ:%.*]] = freeze i16 [[Y:%.*]]
+; CHECK-NEXT: [[X_FRZ:%.*]] = freeze i16 [[X:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i16 [[X_FRZ]], [[Y_FRZ]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.ucmp.i16.i16(i16 [[X_FRZ]], i16 [[Y_FRZ]])
+; CHECK-NEXT: [[SELECT_UCMP:%.*]] = select i1 [[TMP1]], i16 [[Z:%.*]], i16 [[TMP2]]
+; CHECK-NEXT: ret i16 [[SELECT_UCMP]]
+;
+ %3 = icmp eq i16 %x, %y
+ %4 = icmp ult i16 %x, %y
+ %5 = select i1 %4, i16 -1, i16 1
+ %6 = select i1 %3, i16 %Z, i16 %5
+ ret i16 %6
+}
+
+define i16 @icmp_fold_to_llvm_ucmp_when_ne(i16 %x, i16 %y) {
+; CHECK-LABEL: @icmp_fold_to_llvm_ucmp_when_ne(
+; CHECK-NEXT: [[Y_FRZ:%.*]] = freeze i16 [[Y:%.*]]
+; CHECK-NEXT: [[X_FRZ:%.*]] = freeze i16 [[X:%.*]]
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i16 [[X_FRZ]], [[Y_FRZ]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.ucmp.i16.i16(i16 [[X_FRZ]], i16 [[Y_FRZ]])
+; CHECK-NEXT: [[SELECT_UCMP:%.*]] = select i1 [[DOTNOT]], i16 42, i16 [[TMP1]]
+; CHECK-NEXT: ret i16 [[SELECT_UCMP]]
+;
+ %3 = icmp ne i16 %x, %y
+ %4 = icmp ult i16 %x, %y
+ %5 = select i1 %4, i16 -1, i16 1
+ %6 = select i1 %3, i16 %5, i16 42
+ ret i16 %6
+}
+
+define i16 @icmp_fold_to_llvm_ucmp_negative_test_invalid_constant_1(i16 %x, i16 %y, i16 %Z) {
+; CHECK-LABEL: @icmp_fold_to_llvm_ucmp_negative_test_invalid_constant_1(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i16 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 [[Z:%.*]], i16 1
+; CHECK-NEXT: ret i16 [[TMP2]]
+;
+ %3 = icmp eq i16 %x, %y
+ %4 = icmp ult i16 %x, %y
+ %5 = select i1 %4, i16 1, i16 1 ; invalid constant
+ %6 = select i1 %3, i16 %Z, i16 %5
+ ret i16 %6
+}
+
+define i16 @icmp_fold_to_llvm_ucmp_negative_test_invalid_constant_2(i16 %x, i16 %y, i16 %Z) {
+; CHECK-LABEL: @icmp_fold_to_llvm_ucmp_negative_test_invalid_constant_2(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i16 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 [[Z:%.*]], i16 -1
+; CHECK-NEXT: ret i16 [[TMP2]]
+;
+ %3 = icmp eq i16 %x, %y
+ %4 = icmp ult i16 %x, %y
+ %5 = select i1 %4, i16 -1, i16 -1 ; invalid constant
+ %6 = select i1 %3, i16 %Z, i16 %5
+ ret i16 %6
+}
+
declare void @use(i1)
declare void @use.i8(i8)
|
|
@dtcxzyw could you run the benchmarks to see whether this transformation is profitable? |
🐧 Linux x64 Test Results
|
|
@dtcxzyw thanks, I fixed two bugs. First, the return type of the intrinsic wasn't correct. Second, the fold must not be applied to pointer comparisons. |
dtcxzyw
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For the first pattern, s/ucmp is less profitable than select + icmp because the former needs two icmps + two selects/one sub (see also TargetLowering::expandCMP). It doesn't simplify the IR, since X and Y still have two uses after the transformation.
For the second pattern, select(icmp(eq, X, Y), 0, llvm.cmp(X, Y)) -> llvm.cmp(X, Y), absorbing the equality test into s/ucmp is interesting. Unfortunately, we haven't seen this pattern in real-world programs.
Created a pattern to recognize `llvm.ucmp` and `llvm.scmp`.
Alive Proof: https://alive2.llvm.org/ce/z/BYRyu-
Closes #166579